# Discretion in Clinical Decision-Making: Evidence from Bunching
# Claire Boone
# Identifies the patient sample and cleans EHR data
# Creates table A1 (partial)


# set up -----------------
  library(tidyverse)
  library(magrittr)
  library(ggpubr)
  library(here)
  
  # Project root as resolved by here::here().
  proj_dir <- here::here()
  # Sub-directory paths. Each keeps its trailing "/" because file names are
  # appended to them with paste0() further down, not with file.path().
  raw_data <- file.path(proj_dir, "raw_data/")
  gen_data <- file.path(proj_dir, "gen_data/")
  # `out` is defined here but not referenced in the rest of this script.
  out <- file.path(proj_dir, "out/")

# load data ---------------------------
  # Restores the objects stored in ehr_cleaned.Rda; the code below works on `hd`.
  load(file = paste0(gen_data, "ehr_cleaned.Rda"))

  # for table A1: distinct `id` and `cod_est` values before any filtering
  n_distinct(hd[["id"]])
  n_distinct(hd[["cod_est"]])
  
  
# initial cleaning ---------------------------------
    # Recode the literal string "na" in diag_cv to a real missing value, then
    # derive one 0/1 flag per code ("1", "2", "3"), matched as a fixed substring
    # of diag_cv. Each flag is NA wherever diag_cv itself is missing.
    # NOTE(review): substring matching would also hit multi-digit codes such as
    # "10" or "12" -- confirm diag_cv only ever contains the codes 1-3.
    hd %<>% mutate(
      diag_cv = replace(diag_cv, diag_cv == "na", NA),
      diag_htn = ifelse(is.na(diag_cv), NA, as.numeric(grepl("1", diag_cv, fixed = TRUE))),
      diag_dm = ifelse(is.na(diag_cv), NA, as.numeric(grepl("2", diag_cv, fixed = TRUE))),
      diag_dislip = ifelse(is.na(diag_cv), NA, as.numeric(grepl("3", diag_cv, fixed = TRUE)))
    )

    # Number each id's rows in fecha_atencion order (rows_n_ehr and rows_n start
    # out identical; rows_n is recomputed later), then drop rows with missing
    # pa_sys.
    hd %<>%
      arrange(fecha_atencion) %>%
      group_by(id) %>%
      mutate(rows_n_ehr = row_number(), rows_n = rows_n_ehr) %>%
      ungroup() %>%
      filter(!is.na(pa_sys))

    # for table A1: distinct `id` and `cod_est` values after the pa_sys filter
    n_distinct(hd[["id"]])
    n_distinct(hd[["cod_est"]])

# merge with raw medication dataset ------------------------
  # Load the raw prescription records and lower-case every value and every
  # column name. tolower() returns character, so all fad columns (id, dates,
  # flags) are strings from here on.
  load(paste0(raw_data, "raw_fad.Rda"))
  fad <- raw_fad %>% mutate(across(everything(), tolower))
  names(fad) <- tolower(names(fad))

  # Keep, per id, the first record (in fec_emi_receta order) with med_htn == 1.
  # NOTE(review): fec_emi_receta is sorted as a string here, which is
  # chronological only if it is stored year-first (e.g. "YYYY-MM-DD") -- confirm.
  fad %<>%
    arrange(id, fec_emi_receta) %>%
    group_by(id, med_htn) %>%
    mutate(rows_htn_med = row_number()) %>%
    ungroup() %>%  # do not leave fad grouped for later steps
    filter(rows_htn_med == 1 & med_htn == 1)

  # Attach that record to every row of the same id; ids without one get NA in
  # all fad columns.
  hd <- left_join(hd, fad, by = "id")

  # 0/1 indicator for having such a record. med_htn is the string "1" after
  # tolower(); converting it keeps the result numeric instead of letting
  # ifelse() coerce the whole column to character ("0"/"1").
  hd %<>% mutate(first_htn_med = ifelse(is.na(med_htn), 0, as.numeric(med_htn)))

# clean diagnosis and medication date variables ----------------------
  # rows_htn numbers each id's rows within a diag_htn value, in date order, so
  # rows_htn == 1 & diag_htn == 1 marks the first row flagged diag_htn == 1.
  hd %<>%
    arrange(id, fecha_atencion) %>%
    group_by(id, diag_htn) %>%
    mutate(rows_htn = row_number()) %>%
    ungroup()

  # Date of that first flagged row, NA on every other row. ifelse() drops the
  # Date class, hence the explicit conversion on the hd column afterwards.
  hd %<>% mutate(first_htn_diag = ifelse(diag_htn == 1 & rows_htn == 1, fecha_atencion, NA))
  hd$first_htn_diag <- as.Date(hd$first_htn_diag, origin = "1970-01-01")

  # Spread the date to all rows of the id. Ids with no flagged row keep NA:
  # max(..., na.rm = TRUE) on an all-NA group would return -Inf (with a
  # warning), and every date compares >= -Inf, which would mark all of their
  # diag_htn == 0 rows as diagnosed in the next step.
  hd %<>%
    group_by(id) %>%
    mutate(
      first_htn_diag = if (all(is.na(first_htn_diag))) {
        as.Date(NA)
      } else {
        max(first_htn_diag, na.rm = TRUE)
      }
    ) %>%
    ungroup()

  # Carry the diagnosis forward: rows on/after first_htn_diag with diag_htn == 0
  # become 1; anything still NA afterwards becomes 0.
  # NOTE(review): rows on/after first_htn_diag whose diag_htn is NA evaluate to
  # NA here and therefore end up 0, not 1 -- confirm this is intended.
  hd %<>% mutate(diag_htn = ifelse(fecha_atencion >= first_htn_diag & diag_htn != 1, 1, diag_htn))
  hd %<>% mutate(diag_htn = replace_na(diag_htn, 0))

  # Per id, the earliest of fecha_atencion and fec_emi_receta.
  # NOTE(review): pmin() is given fecha_atencion together with fec_emi_receta,
  # which is a character column after the tolower() step -- confirm both
  # columns have compatible types. min_diag_med is not used later in this script.
  hd %<>% mutate(min_diag_med = pmin(fecha_atencion, fec_emi_receta, na.rm = TRUE))
  hd$min_diag_med <- as.Date(hd$min_diag_med)
  hd %<>% group_by(id) %>% mutate(min_diag_med = min(min_diag_med, na.rm = TRUE)) %>% ungroup()

# select patients for main analysis -----------------------  
  # Recompute the row counters from the current diag_htn values:
  #   rows_htn - row order within id and diag_htn value
  #   rows_n   - row order within id
  #   ever_htn - maximum of diag_htn over the id's rows
  hd %<>%
    arrange(id, fecha_atencion) %>%
    group_by(id, diag_htn) %>%
    mutate(rows_htn = row_number()) %>%
    ungroup()
  hd %<>%
    group_by(id) %>%
    mutate(rows_n = row_number(), ever_htn = max(diag_htn)) %>%
    ungroup()

  # tag1: the id's first row, already flagged diag_htn == 1.
  # tag2 (step 1): diag_htn == 0 rows of ids with ever_htn == 1.
  hd %<>% mutate(
    tag1 = ifelse(rows_n == 1 & diag_htn == 1, 1, 0),
    tag2 = ifelse(ever_htn == 1 & tag1 != 1 & diag_htn == 0, 1, 0)
  )

  # tag2max: 1 if the id has at least one row tagged in step 1.
  hd %<>%
    group_by(id) %>%
    mutate(tag2max = max(tag2)) %>%
    ungroup()

  hd %<>% mutate(
    # tag2 (step 2): also tag the first diag_htn == 1 row of those ids.
    tag2 = ifelse(
      ever_htn == 1 & tag1 != 1 & diag_htn == 1 & rows_htn == 1 & tag2max == 1,
      1,
      tag2
    ),
    # tag3: every row of ids with ever_htn == 0.
    tag3 = ifelse(ever_htn == 0, 1, 0),
    # tag: row carries any of the three tags.
    tag = ifelse(tag1 == 1 | tag2 == 1 | tag3 == 1, 1, 0),
    # tag_o: the same three tags applied one after another, starting from 0.
    tag_o = 0,
    tag_o = ifelse(tag1 == 1, 1, tag_o),
    tag_o = ifelse(tag2 == 1, 1, tag_o),
    tag_o = ifelse(tag3 == 1, 1, tag_o)
  )

  # for table A1: distinct `id` and `cod_est` values before the tag filter
  n_distinct(hd[["id"]])
  n_distinct(hd[["cod_est"]])
  
# keep only tagged visits --------
  # `tag` is set per row, so this keeps the tagged rows of each id rather than
  # whole patients.
  hd %<>% filter(tag == 1)
  hd %<>% group_by(id) %>% mutate(latest_visit = max(fecha_atencion)) %>% ungroup()

  # Blank out fec_emi_receta when it falls after the id's latest retained row.
  # as.character() keeps ifelse() from reducing a Date column to bare numbers,
  # so the as.Date() call works whether the column is character or Date.
  hd$fec_emi_receta <- as.Date(
    ifelse(hd$fec_emi_receta > hd$latest_visit, NA, as.character(hd$fec_emi_receta))
  )
  hd %<>% mutate(med_htn_ed = ifelse(!is.na(fec_emi_receta), 1, 0))

  # Drop rows whose fec_emi_receta is before 2013-01-02. Rows with no
  # prescription date (med_htn_ed == 0) must get remove = 0, not NA: filter()
  # discards rows where the condition is NA, which would silently remove every
  # id without a prescription.
  hd %<>% mutate(
    remove = ifelse(!is.na(fec_emi_receta) & fec_emi_receta < as.Date("2013-01-02"), 1, 0)
  )
  hd %<>% filter(remove != 1)

# save ---------------------------------------------
  save(hd, file = paste0(gen_data, "ehr_cleaned_analysis_sample.Rda"))